practice: data cleaning¶

In [2]:
# import my library /تحميل المكتبات التي سوف يتم العمل بها

import pandas as pd
import numpy as np
import plotly.express as px
from matplotlib import pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_rows = None
pd.options.display.max_columns =None
In [3]:
#wrangle function / انشاء فانكشن لتنظيف البيانات

def wrangle(filename, encoding = None,  dropna_columns_name=[None] ,rename_colums =None ,date_column=None,remove_duplicated_rows_columnsname=None ):
    #loding Data from csv file
    df=pd.read_csv(filename ,encoding = encoding) 
    
    # delet non rows
    df.dropna(subset=dropna_columns_name ,inplace=True)
    
    #Rename columns= {old:new}
    df.rename(columns=rename_colums, inplace=True)
    
    #chang orderdate and id colimns type to data integer
    df[date_column] =pd.to_datetime(df[date_column])

    df= df.drop_duplicates(subset=remove_duplicated_rows_columnsname)
    
    #strip and lowercase columns names
    df.columns = df.columns.str.strip().str.lower()
    
    
    
    return df

Issues:¶

  • Delete orders with NaN values
  • Rename City.1, Country.1, Phon.1 Columns
  • Change OrderDate Type into Date
  • Change ID Columns, and OrderNumber column to integer
In [5]:
# run my function
orders = wrangle(filename = "all_data.csv" ,encoding="iso-8859-1" ,dropna_columns_name=["OrderID"],rename_colums={"City.1":"suppcity","Country.1":"suppCountry","Phone.1":"suppphone"},date_column="OrderDate",remove_duplicated_rows_columnsname=["OrderID"])
orders.info()
<class 'pandas.core.frame.DataFrame'>
Index: 830 entries, 0 to 2130
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   customerid      830 non-null    float64       
 1   firstname       830 non-null    object        
 2   lastname        830 non-null    object        
 3   city            830 non-null    object        
 4   country         830 non-null    object        
 5   phone           830 non-null    object        
 6   orderid         830 non-null    float64       
 7   orderdate       830 non-null    datetime64[ns]
 8   ordernumber     830 non-null    float64       
 9   totalamount     830 non-null    float64       
 10  productid       830 non-null    float64       
 11  productname     830 non-null    object        
 12  unitprice       830 non-null    float64       
 13  package         830 non-null    object        
 14  isdiscontinued  830 non-null    object        
 15  supplierid      830 non-null    float64       
 16  companyname     830 non-null    object        
 17  contactname     830 non-null    object        
 18  suppcity        830 non-null    object        
 19  suppcountry     830 non-null    object        
 20  suppphone       830 non-null    object        
 21  fax             318 non-null    object        
dtypes: datetime64[ns](1), float64(7), object(14)
memory usage: 149.1+ KB
In [ ]:
 
In [6]:
# Change ID Columns, and OrderNumber column to integer

for i in orders.columns:
    if "id" in i or "ordernumber" in i:
        orders[i] = orders[i].astype(int)

orders=orders[["customerid","firstname","lastname","city","country","orderid","orderdate","ordernumber","totalamount"]]
orders.info()
<class 'pandas.core.frame.DataFrame'>
Index: 830 entries, 0 to 2130
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   customerid   830 non-null    int32         
 1   firstname    830 non-null    object        
 2   lastname     830 non-null    object        
 3   city         830 non-null    object        
 4   country      830 non-null    object        
 5   orderid      830 non-null    int32         
 6   orderdate    830 non-null    datetime64[ns]
 7   ordernumber  830 non-null    int32         
 8   totalamount  830 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int32(3), object(4)
memory usage: 55.1+ KB

EDA¶

Univariate Analysis (p1)¶

In [8]:
# quantitative data
# describe totalamount column
orders.totalamount.describe()
Out[8]:
count      830.00
mean     1,631.88
std      1,990.61
min         12.50
25%        480.00
50%      1,015.90
75%      2,028.65
max     17,250.00
Name: totalamount, dtype: float64
In [9]:
orders.head(10)
Out[9]:
customerid firstname lastname city country orderid orderdate ordernumber totalamount
0 85 Paul Henriot Reims France 1 2012-07-04 542378 440.00
3 79 Karin Josephs Münster Germany 2 2012-07-05 542379 1,863.40
5 34 Mario Pontes Rio de Janeiro Brazil 3 2012-07-08 542380 1,813.00
8 84 Mary Saveley Lyon France 4 2012-07-08 542381 670.80
11 76 Pascale Cartrain Charleroi Belgium 5 2012-07-09 542382 3,730.00
14 34 Mario Pontes Rio de Janeiro Brazil 6 2012-07-10 542383 1,444.80
17 14 Yang Wang Bern Switzerland 7 2012-07-11 542384 625.20
20 68 Michael Holz Genève Switzerland 8 2012-07-12 542385 2,490.50
24 88 Paula Parente Resende Brazil 9 2012-07-15 542386 517.80
26 35 Carlos Hern?ndez San Crist?bal Venezuela 10 2012-07-16 542387 1,119.90
In [11]:
# We need to know why mean > median, using histogram to know the shape of data
sns.histplot(data = orders , x='totalamount',kde=True)
#---------------------------------
plt.title("distribution of totalamount of invoices")
plt.xlabel("totalamount")
plt.ylabel("frequercy")
#---------------------------------
plt.axvline(orders.totalamount.mean() ,color='red' )
plt.axvline(orders.totalamount.median() , color="black")
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
Out[11]:
<matplotlib.lines.Line2D at 0x19496a38d50>
No description has been provided for this image
In [12]:
min = orders.totalamount.min()
q1=  orders.totalamount.quantile(.25)
q2=   orders.totalamount.median()
q3= orders.totalamount.quantile(.75)
iqr= q3-q1
lb=q1-(1.5*iqr)
ub=q3+(1.5*iqr)
In [13]:
print("min=" , min , "q1=" , q1 ,"q3=" ,"q2=",q2, q3 , "iqr= ",iqr ,"lb=",lb ,  "ub =",ub)
min= 12.5 q1= 480.0 q3= q2= 1015.9 2028.65 iqr=  1548.65 lb= -1842.9750000000004 ub = 4351.625
In [14]:
orders.select_dtypes('object').nunique()
Out[14]:
firstname    84
lastname     88
city         69
country      21
dtype: int64
In [31]:
# Boxplot to know the 5 number summary
px.box(data_frame=orders , x='totalamount')
In [33]:
orders=orders[orders.totalamount < 15000]
In [35]:
px.box(data_frame=orders , x='totalamount')

Bivariate analysis P1¶

countres & totalamount¶

cites & totalamount¶

In [38]:
px.bar(data_frame=orders.city.value_counts() , title="Distribution of cityes")
In [40]:
px.bar(data_frame=orders.country.value_counts() , title="Distribution of countrys")
In [42]:
px.pie(data_frame=orders, names="country")

through the preliminary analysis , we see that london is the city that has largest share of lagest share of purchasing our products with 46 invoices ,and that the united states and germany are the most purchasing countries for our products

نري من خلال التحليل الاول للبيانات ان لاندن هيا التي تملك الحصه الاكبر من الشراء بواقع 46 فاتوره ,وان الويلايت المتحده و المانيا هم اكثر الدول شراء لمنتجات بواقع 121 فتوره بنسبه 14.6% من اجمالي مبيعاتنا

In [44]:
orders.head(5)
Out[44]:
customerid firstname lastname city country orderid orderdate ordernumber totalamount
0 85 Paul Henriot Reims France 1 2012-07-04 542378 440.00
3 79 Karin Josephs Münster Germany 2 2012-07-05 542379 1,863.40
5 34 Mario Pontes Rio de Janeiro Brazil 3 2012-07-08 542380 1,813.00
8 84 Mary Saveley Lyon France 4 2012-07-08 542381 670.80
11 76 Pascale Cartrain Charleroi Belgium 5 2012-07-09 542382 3,730.00
In [46]:
orders.groupby("country",as_index=False)['totalamount'].sum().sort_values("totalamount",ascending=False)
Out[46]:
country totalamount
19 USA 247,245.08
8 Germany 227,390.63
1 Austria 139,496.63
3 Brazil 99,158.48
7 France 85,498.76
20 Venezuela 60,814.89
18 UK 60,616.51
16 Sweden 59,523.70
9 Ireland 57,317.39
4 Canada 55,334.10
2 Belgium 35,134.98
5 Denmark 34,782.25
17 Switzerland 32,919.50
11 Mexico 24,073.45
6 Finland 19,778.45
15 Spain 19,431.89
10 Italy 16,705.15
14 Portugal 12,468.65
0 Argentina 8,119.10
12 Norway 5,735.15
13 Poland 3,531.95
In [48]:
# plots funtion
def plots(coulumname, top10=False):
    prop=orders.groupby(coulumname ,as_index=False)["totalamount"].sum()
    prop["proporiton"]=(prop.totalamount / orders.totalamount.sum()) * 100
    prop=prop.sort_values("proporiton" , ascending=False)
    print("describe of totalamount ", prop.totalamount.describe())
    if top10 ==  True :
        prop = prop.head(10)

        
    fig1= px.bar(data_frame=prop , x=coulumname ,y="totalamount")
    fig2= px.pie(data_frame=prop , names=coulumname , values="totalamount")
    
    fig2.show()
    fig1.show()
    

country vs totalamount¶

In [50]:
plots("country")
describe of totalamount  count        21.00
mean     62,146.51
std      67,460.19
min       3,531.95
25%      19,431.89
50%      35,134.98
75%      60,814.89
max     247,245.08
Name: totalamount, dtype: float64

نلاحظ ان امريكا هيا الاعلي شراء بنسبه 18.1% و بمبلغ 247,254 الف

city vs totalamount top 10¶

In [52]:
plots("city", True)
describe of totalamount  count        69.00
mean     18,914.15
std      22,692.99
min         357.00
25%       4,788.06
50%      11,830.10
75%      23,850.95
max     113,236.68
Name: totalamount, dtype: float64

نلاحظ ان:

اعلي مدينه اشترت بضاعه هيا جراز بنسبه 18.1% بمجموع مبيعات 113.236 بينما لاندن تظهر 5 في المشتريات بنسبه 8.71 بمبلغ مشتريات 54.4702 الف بينما كنت هيا الاوله في عدد الفوتير

Bivariate analysis P2¶

orderdate & totalamount¶

In [54]:
orders.head()
Out[54]:
customerid firstname lastname city country orderid orderdate ordernumber totalamount
0 85 Paul Henriot Reims France 1 2012-07-04 542378 440.00
3 79 Karin Josephs Münster Germany 2 2012-07-05 542379 1,863.40
5 34 Mario Pontes Rio de Janeiro Brazil 3 2012-07-08 542380 1,813.00
8 84 Mary Saveley Lyon France 4 2012-07-08 542381 670.80
11 76 Pascale Cartrain Charleroi Belgium 5 2012-07-09 542382 3,730.00
In [58]:
px.line(data_frame = date_total , y="totalamount" , x="orderdate") 
C:\Users\dell\anaconda3\Lib\site-packages\_plotly_utils\basevalidators.py:106: FutureWarning:

The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

customers vs totalamount¶

In [61]:
orders.head(5)
Out[61]:
customerid firstname lastname city country orderid orderdate ordernumber totalamount
0 85 Paul Henriot Reims France 1 2012-07-04 542378 440.00
3 79 Karin Josephs Münster Germany 2 2012-07-05 542379 1,863.40
5 34 Mario Pontes Rio de Janeiro Brazil 3 2012-07-08 542380 1,813.00
8 84 Mary Saveley Lyon France 4 2012-07-08 542381 670.80
11 76 Pascale Cartrain Charleroi Belgium 5 2012-07-09 542382 3,730.00
In [63]:
cust_orders={
    "full_name": orders.firstname  + " "  + orders.lastname,
    "totalamount": orders.totalamount
}

cust_orders=pd.DataFrame(cust_orders)
cust_orders.head(10)
Out[63]:
full_name totalamount
0 Paul Henriot 440.00
3 Karin Josephs 1,863.40
5 Mario Pontes 1,813.00
8 Mary Saveley 670.80
11 Pascale Cartrain 3,730.00
14 Mario Pontes 1,444.80
17 Yang Wang 625.20
20 Michael Holz 2,490.50
24 Paula Parente 517.80
26 Carlos Hern?ndez 1,119.90
In [65]:
#top 10 cust count orders
fig=px.bar(cust_orders.groupby("full_name")["totalamount"].count().sort_values(ascending=False).head(10),text_auto=True ,
           labels={"full_name":"full_name","value":"frequency" })
In [67]:
fig.update_traces(marker_color= "rgb(158,202,225)" ,marker_line_color="rgb(8,84,107)",opacity=0.6,textposition="outside")
In [69]:
#top sum cust orders 
fig=px.bar(cust_orders.groupby("full_name")["totalamount"].sum().sort_values(ascending=False).head(10),text_auto=True ,
           labels={"full_name":"full_name","value":"frequency" })
In [71]:
fig.update_traces(marker_color= "rgb(158,250,255)" ,marker_line_color="rgb(8,90,107)",opacity=0.6,textposition="inside")

multivariate analysis¶

In [74]:
orders.head()
Out[74]:
customerid firstname lastname city country orderid orderdate ordernumber totalamount
0 85 Paul Henriot Reims France 1 2012-07-04 542378 440.00
3 79 Karin Josephs Münster Germany 2 2012-07-05 542379 1,863.40
5 34 Mario Pontes Rio de Janeiro Brazil 3 2012-07-08 542380 1,813.00
8 84 Mary Saveley Lyon France 4 2012-07-08 542381 670.80
11 76 Pascale Cartrain Charleroi Belgium 5 2012-07-09 542382 3,730.00

country with orderdate and totalamount

we interested in nowing the number of countries and the totala amount of them month¶

In [78]:
multivar = orders[["orderdate" , "country" , "totalamount"]]
multivar = multivar.set_index("orderdate")
multivar.head()
Out[78]:
country totalamount
orderdate
2012-07-04 France 440.00
2012-07-05 Germany 1,863.40
2012-07-08 Brazil 1,813.00
2012-07-08 France 670.80
2012-07-09 Belgium 3,730.00
In [80]:
multivar = multivar.resample("1M").agg({'country': "nunique" ,"totalamount":"sum"}) #aggrygat funciton()
multivar.head()
Out[80]:
country totalamount
orderdate
2012-07-31 11 30,192.10
2012-08-31 11 26,609.40
2012-09-30 11 27,636.00
2012-10-31 13 41,203.60
2012-11-30 11 49,704.00
In [82]:
fig = px.bar (data_frame=multivar,x = multivar.index , y=round(multivar.totalamount,0), color =multivar.country ,text_auto=True , title="grand total amount of all countries per month" , labels={"orderdate":"order date" ,"y" :"totalamount"})
C:\Users\dell\anaconda3\Lib\site-packages\_plotly_utils\basevalidators.py:106: FutureWarning:

The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

In [84]:
fig
In [ ]: